May 15, 2015
rvest makes scraping HTML pages super easy.
library(rvest)
# First, grab the page source
html("http://en.wikipedia.org/wiki/Table_(information)") %>%
# then extract the first node with class of wikitable
html_node(".wikitable") %>%
# then convert the HTML table into a data frame
html_table()
## First name Last name Age ## 1 Bielat Adamczak 24 ## 2 Blaszczyk Kostrzewski 25 ## 3 Olatunkboh Chijiaku 22 ## 4 Adrienne Anthoula 22 ## 5 Axelia Athanasios 22 ## 6 Jon-Kabat Zinn 22
html_table() only works on 'nicely' formatted HTML tables.This is a nice format? Really? Yes, really. It's the format used to render tables on webpages.
<table class="wikitable">
<tr>
<th>First name</th>
<th>Last name</th>
<th>Age</th>
</tr>
<tr>
<td>Bielat</td>
<td>Adamczak</td>
<td>24</td>
</tr>
<tr>
<td>Blaszczyk</td>
<td>Kostrzewski</td>
<td>25</td>
</tr>
<tr>
<td>Olatunkboh</td>
<td>Chijiaku</td>
<td>22</td>
</tr>
</table>
<table> data?domain <- "http://www.sec.gov"
susp <- file.path(domain, "litigation/suspensions.shtml")
hrefs <- html(susp) %>% html_nodes("p+ table a") %>% html_attr(name = "href")
tail(hrefs)
## [1] "/litigation/suspensions/2015/34-74326.pdf" ## [2] "/litigation/suspensions/2015/34-74326-o.pdf" ## [3] "/litigation/suspensions/2015/34-74324.pdf" ## [4] "/litigation/suspensions/2015/34-74324-o.pdf" ## [5] "/litigation/suspensions/2015/34-74218.pdf" ## [6] "/litigation/suspensions/2015/34-74218-o.pdf"
# download all the pdfs! hrefs <- hrefs[!is.na(hrefs)] pdfs <- paste0(domain, hrefs) mapply(download.file, pdfs, basename(pdfs))
GET) is usually what you want.library(httr)
me <- GET("https://api.github.com/users/cpsievert")
content(me)[c("name", "company")]
## $name ## [1] "Carson" ## ## $company ## [1] "Iowa State University"
library(rvest)
html("http://www.techstars.com/companies/stats/") %>%
html_node(".table75") %>% html_table()
Error in UseMethod("html_table") :
no applicable method for 'html_table' applied to an object of class "NULL"
Our technique from earlier doesn't work, why not?
RSelenium allows us to control any web browser we want from the R console (so we can access the DOM!!).
library(RSelenium)
pJS <- phantom()
remDr <- remoteDriver(browserName = 'phantomjs')
remDr$open()
remDr$navigate("http://www.techstars.com/companies/stats/")
src <- remDr$getPageSource()[[1]]
library(rvest)
html(src) %>% html_node(".table75") %>% html_table()
Status Number of Companies Percentage 1 Active 400 76.34% 2 Acquired 68 12.98% 3 Failed 58 11.07%
pJS$stop()
XML is a markup language that looks very similar to HTML.
<mariokart>
<driver name="Bowser" occupation="Koopa">
<vehicle speed="55" weight="25"> Wario Bike </vehicle>
<vehicle speed="40" weight="67"> Piranha Prowler </vehicle>
</driver>
<driver name="Peach" occupation="Princess">
<vehicle speed="54" weight="29"> Royal Racer </vehicle>
<vehicle speed="50" weight="34"> Wild Wing </vehicle>
</driver>
</mariokart>
XML2R is a framework to simplify acquistion of tabular/relational XML.
library(XML2R)
obs <- XML2Obs("http://bit.ly/mario-xml")
table(names(obs))
## ## mariokart//driver mariokart//driver//vehicle ## 2 4
obs # named list of observations
## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "55" "25" " Wario Bike " ## ## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "40" "67" " Piranha Prowler " ## ## $`mariokart//driver` ## name occupation ## [1,] "Bowser" "Koopa" ## ## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "54" "29" " Royal Racer " ## ## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "50" "34" " Wild Wing " ## ## $`mariokart//driver` ## name occupation ## [1,] "Peach" "Princess"
collapse_obs(obs) # group into table(s) by observational name/unit
## $`mariokart//driver` ## name occupation ## [1,] "Bowser" "Koopa" ## [2,] "Peach" "Princess" ## ## $`mariokart//driver//vehicle` ## speed weight XML_value ## [1,] "55" "25" " Wario Bike " ## [2,] "40" "67" " Piranha Prowler " ## [3,] "54" "29" " Royal Racer " ## [4,] "50" "34" " Wild Wing "
obs <- add_key(obs, parent = "mariokart//driver", recycle = "name") collapse_obs(obs)
## $`mariokart//driver` ## name occupation ## [1,] "Bowser" "Koopa" ## [2,] "Peach" "Princess" ## ## $`mariokart//driver//vehicle` ## speed weight XML_value name ## [1,] "55" "25" " Wario Bike " "Bowser" ## [2,] "40" "67" " Piranha Prowler " "Bowser" ## [3,] "54" "29" " Royal Racer " "Peach" ## [4,] "50" "34" " Wild Wing " "Peach"
Now (if I want) I can merge the tables into a single table…
tabs <- collapse_obs(obs) merge(tabs[[1]], tabs[[2]], by = "name")
## name occupation speed weight XML_value ## 1 Bowser Koopa 55 25 Wario Bike ## 2 Bowser Koopa 40 67 Piranha Prowler ## 3 Peach Princess 54 29 Royal Racer ## 4 Peach Princess 50 34 Wild Wing
[
{
"driver": "Bowser",
"occupation": "Koopa",
"vehicles": [
{
"model": "Wario Bike",
"speed": 55,
"weight": 25
},
{
"model": "Piranha Prowler",
"speed": 40,
"weight": 67
}
]
},
{
"driver": "Peach",
"occupation": "Princess",
"vehicles": [
{
"model": "Royal Racer",
"speed": 54,
"weight": 29
},
{
"model": "Wild Wing",
"speed": 50,
"weight": 34
}
]
}
]
library(jsonlite)
mario <- fromJSON("http://bit.ly/mario-json")
str(mario) # nested data.frames?!?
## 'data.frame': 2 obs. of 3 variables: ## $ driver : chr "Bowser" "Peach" ## $ occupation: chr "Koopa" "Princess" ## $ vehicles :List of 2 ## ..$ :'data.frame': 2 obs. of 3 variables: ## .. ..$ model : chr "Wario Bike" "Piranha Prowler" ## .. ..$ speed : int 55 40 ## .. ..$ weight: int 25 67 ## ..$ :'data.frame': 2 obs. of 3 variables: ## .. ..$ model : chr "Royal Racer" "Wild Wing" ## .. ..$ speed : int 54 50 ## .. ..$ weight: int 29 34
mario$driver
## [1] "Bowser" "Peach"
mario$vehicles
## [[1]] ## model speed weight ## 1 Wario Bike 55 25 ## 2 Piranha Prowler 40 67 ## ## [[2]] ## model speed weight ## 1 Royal Racer 54 29 ## 2 Wild Wing 50 34
How do we get two tables (with a common id) like the XML example?
# this mapply statement is essentially equivalent to add_key
vehicles <- mapply(function(x, y) cbind(x, driver = y),
mario$vehicles, mario$driver, SIMPLIFY = FALSE)
Reduce(rbind, vehicles)
## model speed weight driver ## 1 Wario Bike 55 25 Bowser ## 2 Piranha Prowler 40 67 Bowser ## 3 Royal Racer 54 29 Peach ## 4 Wild Wing 50 34 Peach
mario[!grepl("vehicle", names(mario))]
## driver occupation ## 1 Bowser Koopa ## 2 Peach Princess